library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(openxlsx)
library(dplyr)
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
url.data <- "https://data.cdc.gov/api/views/9mfq-cb36/rows.csv?accessType=DOWNLOAD"
raw <-read.csv(url.data)
raw
lets test this?
sort(unique(raw$state))
## [1] "AK" "AL" "AR" "AS" "AZ" "CA" "CO" "CT" "DC" "DE" "FL" "FSM"
## [13] "GA" "GU" "HI" "IA" "ID" "IL" "IN" "KS" "KY" "LA" "MA" "MD"
## [25] "ME" "MI" "MN" "MO" "MP" "MS" "MT" "NC" "ND" "NE" "NH" "NJ"
## [37] "NM" "NV" "NY" "NYC" "OH" "OK" "OR" "PA" "PR" "PW" "RI" "RMI"
## [49] "SC" "SD" "TN" "TX" "UT" "VA" "VI" "VT" "WA" "WI" "WV" "WY"
So firstly FSM is a small island Nation, NYC is New York City, which should be combined with New York State. In addition not all states are included in this list, indicating that not all states provided data. Let’s see what happens when we look into the data some more. Just going off the back of my hand and a quick skim of the data list, “AS” “NYC” “PR” “PW” “RI” “RMI” “FSM” “MP” “VI” all appear to be not valid options. First things first, I want to create a new dataframe linking Created_at and submission date, mostly because I am interested in seeing it.
deltaDate <- subset(raw, select = c(submission_date,created_at) )
deltaDate$date_diff <- as.Date(as.character(deltaDate$submission_date), format="%m/%d/%Y")-
as.Date(as.character(deltaDate$created_at), format="%m/%d/%Y %H:%M:%S")
deltaDate <- deltaDate[order(deltaDate$date_diff),]
deltaDate
write.csv(deltaDate,"Covid_data_reporting_delta.csv", row.names = FALSE)
Firstly, there are more entries than there are actual states. At just a first glance over, NYC should be incorporated into NY as its a city in a state. “AK” “AL” “AR” “AZ” “CA” “CO” “CT” “DC” “DE” “FL” “GA” “HI” “IA” “ID” “IL” “IN” “KY” “LA” “MA” “MD” “ME” “MI” “MN” “MO” “MS” “MT” “NC” “ND” “NE” “NH” “NJ” “NM” “NV” “NY” “OH” “OK” “OR” “PA” “RI” “SC” “SD” “TN” “TX” “UT” “VA” “VT” “WA” “WI” “WV” “WY”
acceptable_states <- c('AK','AL','AR','AZ','CA','CO','CT','DC','DE','FL','GA','HI','IA','ID','IL','IN','KY','KS','LA','MA','MD','ME','MI','MN','MO','MS','MT','NC','ND','NE','NH','NJ','NM','NV','NY','OH','OK','OR','PA','RI','SC','SD','TN','TX','UT','VA','VT','WA','WI','WV','WY')
all_states <- unique(raw$state)
all_states[!(all_states %in% acceptable_states)]
## [1] "GU" "VI" "FSM" "MP" "AS" "PR" "NYC" "PW" "RMI"
data_new <- raw %>% # Replacing values
mutate(state = replace(state, state == 'NYC', 'NY'))
data_new
data_new <- subset(data_new, select = -c(created_at, consent_cases, consent_deaths) )
unique(data_new$state)
## [1] "ND" "MD" "ME" "NE" "IL" "NC" "VT" "MI" "NH" "NV" "DE" "GU"
## [13] "CT" "IN" "AL" "MO" "WI" "MS" "CA" "MT" "VI" "ID" "WA" "OR"
## [25] "FSM" "NJ" "DC" "MN" "AZ" "LA" "KY" "VA" "RI" "WY" "SC" "KS"
## [37] "UT" "AR" "MP" "AS" "HI" "AK" "PR" "OK" "NY" "PW" "GA" "TX"
## [49] "FL" "WV" "MA" "CO" "RMI" "IA" "TN" "OH" "NM" "PA" "SD"
data_new
data_new %>%
group_by(submission_date, state) %>%
summarise_all(sum)
I really didn’t like the approach of aggregating by season as it fails to take into account changes in the COVID situation due to time of year,
Spring-March 1st to May 31 Summer-June 1st to August 31st Fall-Sept 1 to November 30th Winter-Dec 1 to Feb 28/29
data_new$yearquarter = as.yearqtr(data_new$submission_date, format = "%m/%d/%Y")
data_new2 <- subset(data_new, select = -c(submission_date, state) )
data_new3 <- data_new2 %>%
group_by(yearquarter) %>%
summarise_all(sum)
data_new3
write.csv(data_new3,"Covid_data_year_quarter.csv", row.names = FALSE)
So we have our data saved and easily accessible. It is ready to be read into anything else, or picked up in R to analyze!